{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.sys.path.append(os.path.dirname(os.path.abspath('.')))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "data = np.array([[2.5, 3.5, 3, 3.5, 2.5, 3],\n",
    "                 [3, 3.5, 1.5, 5, 3.5, 3],\n",
    "                 [2.5, 3, 0, 3.5, 0, 4],\n",
    "                 [0, 3.5, 3, 0, 4, 4],\n",
    "                 [3, 4, 2, 3, 2, 3],\n",
    "                 [3, 4, 0, 5, 3.5, 3],\n",
    "                 [0, 4.5, 0, 4, 1, 0]])\n",
    "n_users, n_items = data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "有了user-item数据后，可以计算两两user之间的相似度："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from metrics.pairwise.euclidean_distances import euclidean_distances\n",
    "\n",
    "dist_mat=euclidean_distances(data)    # 两两用户之间的距离矩阵\n",
    "sim_mat=1/(1+dist_mat)    # 将距离转化成相似度"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "指定一个用户$user_{i}$，首先找到跟其最相似的前$k$个用户："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "i = 6    # 最后一个用户\n",
    "k = 3    # 使用最相似的前3个用户\n",
    "top_k_sim = sim_mat[i][sim_mat[i] != 1].argsort(\n",
    ")[-1:-k-1:-1]    # 首先排除相似度为1的用户，然后取前k个最相似的用户"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "推荐的本质就是为用户推荐其未曾见过或用过的东西，所以找出指定用户未评分的物品，然后计算相似用户对该物品的加权评分："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2.834951   2.         3.33009799]\n",
      "[3.33009799 2.834951   2.        ] [5 0 2]\n"
     ]
    }
   ],
   "source": [
    "cand_items_mask = (data[i] == 0)    # 提取未评价物品的布尔索引\n",
    "cand_items = np.arange(len(data[i]))[cand_items_mask]    # 候选推荐物品的索引\n",
    "\n",
    "# 相似用户对候选物品的评分矩阵，形状为(top_users,cand_items)\n",
    "scores = data[top_k_sim, :][:, cand_items_mask]\n",
    "# 对已评分用户相似度的求和，作为分母\n",
    "denominator = np.sum(\n",
    "    (scores != 0)*sim_mat[i, top_k_sim].reshape(-1, 1), axis=0)\n",
    "\n",
    "scores = np.sum(\n",
    "    scores * sim_mat[i, top_k_sim].reshape(-1, 1), axis=0)    # 以相似度加权并求和\n",
    "scores = scores/denominator    # 除以相似度的累加\n",
    "\n",
    "idx = np.argsort(scores)[::-1]    # 按分数排序后的索引\n",
    "scores = scores[idx]\n",
    "cand_items = cand_items[idx]\n",
    "\n",
    "print(scores, cand_items)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "封装测试："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(5, 3.3300979931640846), (0, 2.834951003417958), (2, 2.0)]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def CF(data, i, k=5):\n",
    "    '''\n",
    "    i: 用户idx\n",
    "    k: 使用前k个最相似的用户\n",
    "    '''\n",
    "    dist_mat = euclidean_distances(data)    # 两两row之间的距离矩阵\n",
    "    sim_mat = 1/(1+dist_mat)    # 将距离转化成相似度\n",
    "\n",
    "    top_k_sim = sim_mat[i][sim_mat[i] != 1].argsort()[-1:-k-1:-1]\n",
    "\n",
    "    cand_items_msak = (data[i] == 0)\n",
    "    cand_items = np.arange(len(data[i]))[cand_items_msak]\n",
    "\n",
    "    # 相似用户对候选物品的评分矩阵，形状为(top_users,cand_items)\n",
    "    scores = data[top_k_sim, :][:, cand_items_msak]\n",
    "    # 对已评分用户相似度的求和，作为分母\n",
    "    denominator = np.sum(\n",
    "        (scores != 0)*sim_mat[i, top_k_sim].reshape(-1, 1), axis=0)\n",
    "\n",
    "    scores = np.sum(\n",
    "        scores * sim_mat[i, top_k_sim].reshape(-1, 1), axis=0)    # 以相似度加权并求和\n",
    "    scores = scores/denominator    # 除以相似度的累加\n",
    "\n",
    "    idx = np.argsort(scores)[::-1]    # 按分数排序后的索引\n",
    "    scores = scores[idx]\n",
    "    cand_items = cand_items[idx]\n",
    "\n",
    "    return [(item, score) for item, score in zip(cand_items, scores)]\n",
    "\n",
    "\n",
    "CF(data, 6, 3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果需要针对物品推荐用户，将data矩阵转置即可。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(6, 4.0), (5, 3.7848875039392977), (2, 2.8924437519696484)]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_T = data.T\n",
    "CF(data_T, 2, 2)"
   ]
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
